In [1]:
%matplotlib inline
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from collections import Counter

#import colorlover as cl

from IPython.display import HTML, display

from chorogrid import Colorbin, Chorogrid

In [2]:
sns.set_context("poster")
sns.set_style("ticks")

In [3]:
TOPIC_MAPPING={
    "GunControl": "Gun Control",
    "Privacy": "Privacy",
    "Vaccine": "Vaccine",
    "ChildEducation": "Child Education",
    "SkinDamage": "Skin Damage",
    "SeatBelt": "Seat Belt"
}
topic_order=["Gun Control", "Privacy", "Vaccine",
             "Child Education", "Skin Damage", "Seat Belt"]
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data").rename(columns={
        #u'is_controvertial': u'is_controversial'
    }).assign(
    topic_name=lambda x: x.topic_name.apply(lambda k: TOPIC_MAPPING[k.split('/')[0]]),
)
NON_STATES = set(["UNK", "USA", "AS", "DC", "GU",
              "MP", "PR", "VI"])


STATE_POPULATIONS="""4863300.00	AL
741894.00	AK
6931071.00	AZ
2988248.00	AR
39250017.00	CA
5540545.00	CO
3576452.00	CT
952065.00	DE
681170.00	DC
20612439.00	FL
10310371.00	GA
1428557.00	HI
1683140.00	ID
12801539.00	IL
6633053.00	IN
3134693.00	IA
2907289.00	KS
4436974.00	KY
4681666.00	LA
1331479.00	ME
6016447.00	MD
6811779.00	MA
9928300.00	MI
5519952.00	MN
2988726.00	MS
6093000.00	MO
1042520.00	MT
1907116.00	NE
2940058.00	NV
1334795.00	NH
8944469.00	NJ
2081015.00	NM
19745289.00	NY
10146788.00	NC
757952.00	ND
11614373.00	OH
3923561.00	OK
4093465.00	OR
12784227.00	PA
1056426.00	RI
4961119.00	SC
865454.00	SD
6651194.00	TN
27862596.00	TX
3051217.00	UT
624594.00	VT
8411808.00	VA
7288000.00	WA
1831102.00	WV
5778708.00	WI
585501.00	WY
""".splitlines()

STATE_POPULATIONS = {k:float(v) for v,k in map(lambda x: x.split('\t'), STATE_POPULATIONS)}

CHOROGRID_STATES_FILE='/content/Code/smishra8/chorogrid/chorogrid/databases/usa_states.csv'

In [4]:
STATE_POPULATIONS["AZ"]


Out[4]:
6931071.0

In [5]:
df.columns


Out[5]:
Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS',          u'u_state'],
      dtype='object')

In [6]:
df.CATS.fillna(0).apply(
    lambda x: Counter(['UNK']) 
    if x == 0 
    else Counter(x)
).apply(lambda x: len(x)).describe()


Out[6]:
count    246869.000000
mean          1.139163
std           0.356983
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           5.000000
Name: CATS, dtype: float64

In [7]:
df["CATS_Counter"] = df.CATS.fillna(0).apply(
    lambda x: Counter(['NONE']) 
    if x == 0 
    else Counter(x)
)
df[df.CATS_Counter.apply(lambda x: len(x)) == 2]["CATS_Counter"].head()


Out[7]:
23     {u'socialmedia': 1, u'videos': 1}
29    {u'twitter': 1, u'socialmedia': 1}
38     {u'socialmedia': 1, u'videos': 1}
53     {u'socialmedia': 1, u'videos': 1}
54    {u'twitter': 1, u'socialmedia': 1}
Name: CATS_Counter, dtype: object

Chorogrid plot


In [8]:
df_t = df[df.u_state != "USA"].groupby("u_state")["is_controversial"].agg([np.mean, len, np.std]).reset_index()
df_t


Out[8]:
u_state mean len std
0 AK 0.618557 291 0.486578
1 AL 0.587302 1449 0.492489
2 AR 0.634062 869 0.481970
3 AS 0.300000 10 0.483046
4 AZ 0.573411 3242 0.494658
5 CA 0.654161 22123 0.475652
6 CO 0.617085 2669 0.486189
7 CT 0.594237 1284 0.491230
8 DC 0.822330 5150 0.382272
9 DE 0.543796 274 0.498990
10 FL 0.599461 8913 0.490035
11 GA 0.549006 4377 0.497649
12 GU 0.000000 5 0.000000
13 HI 0.555777 502 0.497375
14 IA 0.562099 934 0.496395
15 ID 0.611529 399 0.488015
16 IL 0.567998 5331 0.495401
17 IN 0.526071 3222 0.499397
18 KS 0.394879 1484 0.488989
19 KY 0.572368 1368 0.494916
20 LA 0.468478 1951 0.499133
21 MA 0.632763 3913 0.482114
22 MD 0.556561 2431 0.496893
23 ME 0.650817 673 0.477066
24 MI 0.574339 3141 0.494522
25 MN 0.572537 1675 0.494858
26 MO 0.564935 2002 0.495889
27 MP 1.000000 2 0.000000
28 MS 0.567430 786 0.495748
29 MT 0.674740 289 0.469284
30 NC 0.612668 3568 0.487209
31 ND 0.456250 160 0.499646
32 NE 0.458272 683 0.498621
33 NH 0.682051 585 0.466078
34 NJ 0.589065 3402 0.492076
35 NM 0.636574 432 0.481544
36 NV 0.588895 1693 0.492180
37 NY 0.656886 14689 0.474765
38 OH 0.580961 4601 0.493455
39 OK 0.555787 1443 0.497050
40 OR 0.651773 2171 0.476518
41 PA 0.606705 4653 0.488534
42 PR 0.625000 24 0.494535
43 RI 0.600000 455 0.490437
44 SC 0.573099 1539 0.494788
45 SD 0.405063 237 0.491943
46 TN 0.514493 2622 0.499885
47 TX 0.569604 11666 0.495153
48 UT 0.482531 1059 0.499931
49 VA 0.639094 3796 0.480327
50 VI 0.000000 5 0.000000
51 VT 0.715827 278 0.451833
52 WA 0.701355 5093 0.457709
53 WI 0.624379 1813 0.484416
54 WV 0.556923 325 0.497515
55 WY 0.606936 173 0.489849

In [9]:
mycolors = ['#b35806', '#f1a340', '#fee0b6', '#d8daeb', '#998ec3', '#542788']
mybin = Colorbin(df_t['mean'], mycolors, proportional=True, decimals=None)
mybin.set_decimals(1)
mybin.recalc(fenceposts=True)
mybin.calc_complements(0.5, '#e0e0e0', '#101010')

In [10]:
states = list(df_t.u_state)
colors_by_state = mybin.colors_out
font_colors_by_state = mybin.complements
legend_colors = mybin.colors_in
legend_labels = mybin.labels

for lst in ['states', 'colors_by_state', 'font_colors_by_state', 'legend_colors', 'legend_labels']:
    obj = eval(lst)
    print("{:>20}: len {:2}: {}...".format(lst, len(obj), obj[:3]))


              states: len 56: ['AK', 'AL', 'AR']...
     colors_by_state: len 56: ['#d8daeb', '#d8daeb', '#d8daeb']...
font_colors_by_state: len 56: ['#101010', '#101010', '#101010']...
       legend_colors: len  6: ['#b35806', '#f1a340', '#fee0b6']...
       legend_labels: len  6: [u'0.0-0.2', u'0.2-0.3', u'0.3-0.5']...

In [11]:
cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
cg.set_title('mean', font_dict={'font-size': 19})
cg.set_legend(legend_colors, legend_labels, title='mean')
cg.draw_multihex(spacing_dict={'margin_right': 150}) # otherwise legend will be cut off
    # another strategy would be to pass a legend_offset to spacing_dict
cg.done(show=True)


WARNING: The following are not recognized ids: set(['PR', 'VI', 'GU', 'AS', 'MP'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 0.8-1.0 0.7-0.8 0.5-0.7 0.3-0.5 0.2-0.3 0.0-0.2 mean mean

Plot individual topic maps


In [12]:
def logit_transform(p):
    eps = 1e-8
    return np.log((p + eps)/(1-p + eps))

In [13]:
def plot_map(df, location_col, value_col, text_cols,
            scl="Portland", title="", cbar_title="", decimals=2, value_transform=None):
    #mycolors = ['#b35806', '#f1a340', '#fee0b6', '#d8daeb', '#998ec3', '#542788']
    #mycolors = ['#b35806','#e08214','#fdb863','#fee0b6','#f7f7f7','#d8daeb','#b2abd2','#8073ac','#542788']
    mycolors = ['#ffffd9','#edf8b1','#c7e9b4','#7fcdbb','#41b6c4','#1d91c0','#225ea8','#253494','#081d58']
    
    values = df[value_col].astype(float)
    if value_transform:
        values = logit_transform(values)
    mybin = Colorbin(values, mycolors,
                     #proportional=True,
                     decimals=None)
    mybin.set_decimals(decimals)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')
    
    states = list(df[location_col])
    colors_by_state = mybin.colors_out
    font_colors_by_state = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels
    
    cg = Chorogrid(
        CHOROGRID_STATES_FILE,
        states, colors_by_state,
    )
    cg.set_title(title, font_dict={'font-size': 19})
    cg.set_legend(legend_colors, legend_labels, title=cbar_title,
                  font_dict={'font-size': '10px', })
    #cg.draw_squares(spacing_dict={'margin_right': 150}) # otherwise legend will be cut off
    cg.draw_multihex(spacing_dict={
        'margin_right': 150,
        'missing_color': '#ffffff',
        'stroke_color': '#000000',
        'stroke_width': 0.1
    }, font_dict={
        'stroke-width': '0.1px',
    }, font_colors=font_colors_by_state)
    cg.done(show=True)

In [14]:
df_t = df[df.u_state != "USA"].groupby("u_state")["is_controversial"].agg([np.mean, len, np.std]).reset_index()

plot_map(df_t,
         "u_state", "mean", ["u_state","len", "std"], scl='Portland',
         title="Proportion of controversial tweets per state",
         cbar_title="Proportion",
        )


WARNING: The following are not recognized ids: set(['PR', 'VI', 'GU', 'AS', 'MP'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 0.89-1.0 0.78-0.89 0.67-0.78 0.56-0.67 0.44-0.56 0.33-0.44 0.22-0.33 0.11-0.22 0.0-0.11 Proportion Proportion of controversial tweets per state

In [15]:
df_t = df.assign(
    fakenews=df.CATS_Counter.apply(lambda x: x.get('fakenews', 0))
)[["u_state", "fakenews"]].groupby("u_state")["fakenews"].agg([np.mean, len, np.std]).reset_index()

plot_map(df_t,
         "u_state", "mean", ["u_state","len", "std"], scl='Portland',
         title="Proportion of fakenews urls per state",
         cbar_title="Proportion"
        )


WARNING: The following are not recognized ids: set(['PR', 'GU', 'USA', 'VI', 'AS', 'MP'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 0.05-0.06 0.04-0.05 0.04-0.04 0.03-0.04 0.02-0.03 0.02-0.02 0.01-0.02 0.01-0.01 0.0-0.01 Proportion Proportion of fakenews urls per state

In [16]:
for url_type in ["fakenews", "news", "blog"]:
    df_t = df[(df.u_state != "USA")
        & (df.t_n_urls > 0)].assign(**{
        url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
    )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.mean, len, np.std]).reset_index()

    plot_map(df_t[
            #(df_t["len"] > (df_t["len"].sum() * 0.01))
            (df_t["len"] >= (df_t["len"].sort_values().values[-10])) 
            & (~df_t["u_state"].isin(NON_STATES))
        ],
             "u_state", "mean", ["u_state","len", "std"], scl='Portland',
             title="Proportion of %s urls (in tweets with URLs) per state" % url_type.title(),
             cbar_title="Proportion"
            )


WARNING: The following ids in the csv are not included: set(['DE', 'DC', 'WI', 'WV', 'HI', 'WY', 'NH', 'NJ', 'NM', 'LA', 'NC', 'ND', 'NE', 'TN', 'RI', 'NV', 'CO', 'AK', 'AL', 'AR', 'VT', 'IN', 'IA', 'MA', 'AZ', 'ID', 'CT', 'ME', 'MD', 'OK', 'OH', 'UT', 'MO', 'MN', 'MI', 'KS', 'MT', 'MS', 'SC', 'KY', 'OR', 'SD'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 0.08-0.08 0.07-0.08 0.07-0.07 0.06-0.07 0.06-0.06 0.05-0.06 0.05-0.05 0.04-0.05 0.04-0.04 Proportion Proportion of Fakenews urls (in tweets with URLs) per state
WARNING: The following ids in the csv are not included: set(['DE', 'DC', 'WI', 'WV', 'HI', 'WY', 'NH', 'NJ', 'NM', 'LA', 'NC', 'ND', 'NE', 'TN', 'RI', 'NV', 'CO', 'AK', 'AL', 'AR', 'VT', 'IN', 'IA', 'MA', 'AZ', 'ID', 'CT', 'ME', 'MD', 'OK', 'OH', 'UT', 'MO', 'MN', 'MI', 'KS', 'MT', 'MS', 'SC', 'KY', 'OR', 'SD'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 0.24-0.25 0.22-0.24 0.21-0.22 0.2-0.21 0.19-0.2 0.17-0.19 0.16-0.17 0.15-0.16 0.14-0.15 Proportion Proportion of News urls (in tweets with URLs) per state
WARNING: The following ids in the csv are not included: set(['DE', 'DC', 'WI', 'WV', 'HI', 'WY', 'NH', 'NJ', 'NM', 'LA', 'NC', 'ND', 'NE', 'TN', 'RI', 'NV', 'CO', 'AK', 'AL', 'AR', 'VT', 'IN', 'IA', 'MA', 'AZ', 'ID', 'CT', 'ME', 'MD', 'OK', 'OH', 'UT', 'MO', 'MN', 'MI', 'KS', 'MT', 'MS', 'SC', 'KY', 'OR', 'SD'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 0.08-0.08 0.07-0.08 0.07-0.07 0.06-0.07 0.06-0.06 0.06-0.06 0.05-0.06 0.05-0.05 0.04-0.05 Proportion Proportion of Blog urls (in tweets with URLs) per state

Split by topics


In [17]:
df.topic_name.value_counts()


Out[17]:
Privacy            73593
Seat Belt          73270
Vaccine            40713
Gun Control        34357
Skin Damage        14128
Child Education    10808
Name: topic_name, dtype: int64

In [18]:
def plot_by_topic(df, url_type, nstates=10):
    display(HTML("<h2>{}</h2>".format(url_type.upper())))
    total_population = sum(STATE_POPULATIONS.values())
    for topic in topic_order:
        df_t = df[(df.u_state != "USA") 
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                 ].assign(
            **{
                url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))
        })[["u_state", url_type]].groupby("u_state")[url_type].agg([np.sum, np.mean, len, np.std]).reset_index()
        df_t["value_rank"] = df_t["mean"].rank(ascending=False)
        df_t = df_t.assign(mean=df_t["sum"] * total_population/df_t["u_state"].apply(
            lambda k: STATE_POPULATIONS.get(k, total_population))) 
        plot_map(df_t[
                #(df_t["len"] > (df_t["len"].sum() * 0.01))
                #(df_t["len"] >= (df_t["len"].sort_values().values[nstates]))
                (~df_t["u_state"].isin(NON_STATES))

            ],
             "u_state", "mean", ["u_state", "value_rank", "mean","len", "std"], scl="Portland",
             title=topic,
             cbar_title="Proportion",
                decimals=3,
        )
        
nstates=None

Fake News Maps


In [19]:
url_type = "fakenews"
plot_by_topic(df, url_type, nstates=nstates)


FAKENEWS

WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 2149.256-2417.913 1880.599-2149.256 1611.942-1880.599 1343.285-1611.942 1074.628-1343.285 805.971-1074.628 537.314-805.971 268.657-537.314 0.0-268.657 Proportion Gun Control
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3097.202-3484.352 2710.052-3097.202 2322.902-2710.052 1935.751-2322.902 1548.601-1935.751 1161.451-1548.601 774.301-1161.451 387.15-774.301 0.0-387.15 Proportion Privacy
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3712.352-4176.396 3248.308-3712.352 2784.264-3248.308 2320.22-2784.264 1856.176-2320.22 1392.132-1856.176 928.088-1392.132 464.044-928.088 0.0-464.044 Proportion Vaccine
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 459.858-517.34 402.376-459.858 344.893-402.376 287.411-344.893 229.929-287.411 172.447-229.929 114.964-172.447 57.482-114.964 0.0-57.482 Proportion Child Education
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 91.628-103.081 80.174-91.628 68.721-80.174 57.267-68.721 45.814-57.267 34.36-45.814 22.907-34.36 11.453-22.907 0.0-11.453 Proportion Skin Damage
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 215.182-242.08 188.285-215.182 161.387-188.285 134.489-161.387 107.591-134.489 80.693-107.591 53.796-80.693 26.898-53.796 0.0-26.898 Proportion Seat Belt

Blog Maps


In [20]:
url_type = "blog"
plot_by_topic(df, url_type, nstates=nstates)


BLOG

WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 976.935-1099.051 854.818-976.935 732.701-854.818 610.584-732.701 488.467-610.584 366.35-488.467 244.234-366.35 122.117-244.234 0.0-122.117 Proportion Gun Control
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3345.69-3763.902 2927.479-3345.69 2509.268-2927.479 2091.057-2509.268 1672.845-2091.057 1254.634-1672.845 836.423-1254.634 418.211-836.423 0.0-418.211 Proportion Privacy
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 4374.577-4921.399 3827.755-4374.577 3280.932-3827.755 2734.11-3280.932 2187.288-2734.11 1640.466-2187.288 1093.644-1640.466 546.822-1093.644 0.0-546.822 Proportion Vaccine
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 293.08-329.715 256.445-293.08 219.81-256.445 183.175-219.81 146.54-183.175 109.905-146.54 73.27-109.905 36.635-73.27 0.0-36.635 Proportion Child Education
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 1087.533-1223.474 951.591-1087.533 815.65-951.591 679.708-815.65 543.766-679.708 407.825-543.766 271.883-407.825 135.942-271.883 0.0-135.942 Proportion Skin Damage
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 1087.533-1223.474 951.591-1087.533 815.65-951.591 679.708-815.65 543.766-679.708 407.825-543.766 271.883-407.825 135.942-271.883 0.0-135.942 Proportion Seat Belt

News Maps


In [21]:
url_type = "news"
plot_by_topic(df, url_type, nstates=nstates)


NEWS

WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 5772.798-6447.727 5097.868-5772.798 4422.939-5097.868 3748.009-4422.939 3073.08-3748.009 2398.15-3073.08 1723.221-2398.15 1048.291-1723.221 373.362-1048.291 Proportion Gun Control
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 11273.506-12636.024 9910.988-11273.506 8548.47-9910.988 7185.952-8548.47 5823.434-7185.952 4460.916-5823.434 3098.398-4460.916 1735.88-3098.398 373.362-1735.88 Proportion Privacy
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 6672.41-7462.344 5882.475-6672.41 5092.54-5882.475 4302.606-5092.54 3512.671-4302.606 2722.736-3512.671 1932.802-2722.736 1142.867-1932.802 352.932-1142.867 Proportion Vaccine
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 781.548-879.241 683.854-781.548 586.161-683.854 488.467-586.161 390.774-488.467 293.08-390.774 195.387-293.08 97.693-195.387 0.0-97.693 Proportion Child Education
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 815.65-917.606 713.693-815.65 611.737-713.693 509.781-611.737 407.825-509.781 305.869-407.825 203.912-305.869 101.956-203.912 0.0-101.956 Proportion Skin Damage
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 2065.6-2323.8 1807.4-2065.6 1549.2-1807.4 1291.0-1549.2 1032.8-1291.0 774.6-1032.8 516.4-774.6 258.2-516.4 0.0-258.2 Proportion Seat Belt

Plots which share the colorbar


In [22]:
def plot_map_subplots(df, url_type, decimals=2, nstates=10):
    display(HTML("<h2>{}</h2>".format(url_type.upper())))
    data = []
    COLS = 3
    ROWS = 2

    values_states = []
    total_population = sum(STATE_POPULATIONS.values())
    for i, topic in enumerate(topic_order):
        x = i % COLS
        y = i / COLS
        df_t = df[(df.u_state != "USA") 
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                 ].assign(**{
        url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
        )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.sum, np.mean, len, np.std]).reset_index()
        df_t = df_t.assign(mean=(df_t["sum"]) / df_t["u_state"].apply(
            lambda k: STATE_POPULATIONS.get(k, total_population)/total_population)
                          ) 


        df_t = df_t[#(df_t["len"] >= (df_t["len"].sort_values().values[-nstates]))
                 (~df_t["u_state"].isin(NON_STATES))
                ]
        values_states.append((
            topic, df_t["mean"].astype(float).values.tolist(),
            df_t["u_state"].values.tolist()
        ))


    #mycolors = ['#b35806', '#f1a340', '#fee0b6', '#d8daeb', '#998ec3', '#542788']
    #mycolors = ['#b35806','#e08214','#fdb863','#fee0b6','#f7f7f7','#d8daeb','#b2abd2','#8073ac','#542788']
    mycolors = ['#ffffd9','#edf8b1','#c7e9b4','#7fcdbb','#41b6c4','#1d91c0','#225ea8','#253494','#081d58']
    mybin = Colorbin(
        sum(map(lambda x: x[1], values_states), []),
        mycolors,
        proportional=True,
        decimals=None
    )
    mybin.set_decimals(3)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')

    colors_by_state_all = mybin.colors_out
    font_colors_by_state_all = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    curr_idx = 0
    for i, topic in enumerate(topic_order):
        states = values_states[i][2]
        colors_by_state = colors_by_state_all[curr_idx:curr_idx+len(states)]
        font_colors_by_state = font_colors_by_state_all[curr_idx:curr_idx+len(states)]
        curr_idx += len(states)
        cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
        cg.set_title(topic, font_dict={'font-size': 19})
        cg.set_legend(legend_colors, legend_labels, title="Proportion",
                  font_dict={'font-size': '10px', })
        #cg.draw_squares(spacing_dict={'margin_right': 150}) # otherwise legend will be cut off
        cg.draw_multihex(spacing_dict={
            'margin_right': 150,
            'missing_color': '#ffffff',
            'stroke_color': '#000000',
            'stroke_width': 0.1
        }, font_dict={
            'stroke-width': '0.1px',
        }, font_colors=font_colors_by_state)
        cg.done(show=True)

In [23]:
plot_map_subplots(df, url_type="fakenews", decimals=2, nstates=nstates)


FAKENEWS

WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3712.352-4176.396 3248.308-3712.352 2784.264-3248.308 2320.22-2784.264 1856.176-2320.22 1392.132-1856.176 928.088-1392.132 464.044-928.088 0.0-464.044 Proportion Gun Control
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3712.352-4176.396 3248.308-3712.352 2784.264-3248.308 2320.22-2784.264 1856.176-2320.22 1392.132-1856.176 928.088-1392.132 464.044-928.088 0.0-464.044 Proportion Privacy
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3712.352-4176.396 3248.308-3712.352 2784.264-3248.308 2320.22-2784.264 1856.176-2320.22 1392.132-1856.176 928.088-1392.132 464.044-928.088 0.0-464.044 Proportion Vaccine
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3712.352-4176.396 3248.308-3712.352 2784.264-3248.308 2320.22-2784.264 1856.176-2320.22 1392.132-1856.176 928.088-1392.132 464.044-928.088 0.0-464.044 Proportion Child Education
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3712.352-4176.396 3248.308-3712.352 2784.264-3248.308 2320.22-2784.264 1856.176-2320.22 1392.132-1856.176 928.088-1392.132 464.044-928.088 0.0-464.044 Proportion Skin Damage
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 3712.352-4176.396 3248.308-3712.352 2784.264-3248.308 2320.22-2784.264 1856.176-2320.22 1392.132-1856.176 928.088-1392.132 464.044-928.088 0.0-464.044 Proportion Seat Belt

In [24]:
plot_map_subplots(df, url_type="blog", decimals=2, nstates=nstates)


BLOG

WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 4374.577-4921.399 3827.755-4374.577 3280.932-3827.755 2734.11-3280.932 2187.288-2734.11 1640.466-2187.288 1093.644-1640.466 546.822-1093.644 0.0-546.822 Proportion Gun Control
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 4374.577-4921.399 3827.755-4374.577 3280.932-3827.755 2734.11-3280.932 2187.288-2734.11 1640.466-2187.288 1093.644-1640.466 546.822-1093.644 0.0-546.822 Proportion Privacy
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 4374.577-4921.399 3827.755-4374.577 3280.932-3827.755 2734.11-3280.932 2187.288-2734.11 1640.466-2187.288 1093.644-1640.466 546.822-1093.644 0.0-546.822 Proportion Vaccine
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 4374.577-4921.399 3827.755-4374.577 3280.932-3827.755 2734.11-3280.932 2187.288-2734.11 1640.466-2187.288 1093.644-1640.466 546.822-1093.644 0.0-546.822 Proportion Child Education
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 4374.577-4921.399 3827.755-4374.577 3280.932-3827.755 2734.11-3280.932 2187.288-2734.11 1640.466-2187.288 1093.644-1640.466 546.822-1093.644 0.0-546.822 Proportion Skin Damage
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 4374.577-4921.399 3827.755-4374.577 3280.932-3827.755 2734.11-3280.932 2187.288-2734.11 1640.466-2187.288 1093.644-1640.466 546.822-1093.644 0.0-546.822 Proportion Seat Belt

In [25]:
plot_map_subplots(df, url_type="news", decimals=2, nstates=nstates)


NEWS

WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 11232.021-12636.024 9828.018-11232.021 8424.016-9828.018 7020.013-8424.016 5616.011-7020.013 4212.008-5616.011 2808.005-4212.008 1404.003-2808.005 0.0-1404.003 Proportion Gun Control
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 11232.021-12636.024 9828.018-11232.021 8424.016-9828.018 7020.013-8424.016 5616.011-7020.013 4212.008-5616.011 2808.005-4212.008 1404.003-2808.005 0.0-1404.003 Proportion Privacy
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 11232.021-12636.024 9828.018-11232.021 8424.016-9828.018 7020.013-8424.016 5616.011-7020.013 4212.008-5616.011 2808.005-4212.008 1404.003-2808.005 0.0-1404.003 Proportion Vaccine
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 11232.021-12636.024 9828.018-11232.021 8424.016-9828.018 7020.013-8424.016 5616.011-7020.013 4212.008-5616.011 2808.005-4212.008 1404.003-2808.005 0.0-1404.003 Proportion Child Education
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 11232.021-12636.024 9828.018-11232.021 8424.016-9828.018 7020.013-8424.016 5616.011-7020.013 4212.008-5616.011 2808.005-4212.008 1404.003-2808.005 0.0-1404.003 Proportion Skin Damage
WARNING: The following ids in the csv are not included: set(['DC'])
AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY 11232.021-12636.024 9828.018-11232.021 8424.016-9828.018 7020.013-8424.016 5616.011-7020.013 4212.008-5616.011 2808.005-4212.008 1404.003-2808.005 0.0-1404.003 Proportion Seat Belt

Show ratio in each state


In [26]:
def plot_map_subplots(df, url_type, decimals=2):
    display(HTML("<h2>{}</h2>".format(url_type.upper())))
    data = []
    COLS = 3
    ROWS = 2

    values_states = []

    for i, topic in enumerate(topic_order):
        x = i % COLS
        y = i / COLS
        df_t = df[(df.u_state != "USA") 
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                 ].assign(**{
        url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
        )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.mean, len, np.std]).reset_index()

        df_t = df_t[(df_t["len"] >= (df_t["len"].sort_values().values[-10]))
                 & (~df_t["u_state"].isin(NON_STATES))
                ]
        values_states.append((
            topic, df_t["mean"].astype(float).values.tolist(),
            df_t["u_state"].values.tolist()
        ))


    #mycolors = ['#b35806', '#f1a340', '#fee0b6', '#d8daeb', '#998ec3', '#542788']
    mycolors = ['#b35806','#e08214','#fdb863','#fee0b6','#f7f7f7','#d8daeb','#b2abd2','#8073ac','#542788']
    mybin = Colorbin(
        sum(map(lambda x: x[1], values_states), []),
        mycolors,
        proportional=True,
        decimals=None
    )
    mybin.set_decimals(3)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')

    colors_by_state_all = mybin.colors_out
    font_colors_by_state_all = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    curr_idx = 0
    for i, topic in enumerate(topic_order):
        states = values_states[i][2]
        colors_by_state = colors_by_state_all[curr_idx:curr_idx+len(states)]
        font_colors_by_state = font_colors_by_state_all[curr_idx:curr_idx+len(states)]
        curr_idx += len(states)
        cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
        cg.set_title(topic, font_dict={'font-size': 19})
        cg.set_legend(legend_colors, legend_labels, title="Proportion",
                  font_dict={'font-size': '10px', })
        #cg.draw_squares(spacing_dict={'margin_right': 150}) # otherwise legend will be cut off
        cg.draw_multihex(spacing_dict={
            'margin_right': 150,
            'missing_color': '#ffffff',
            'stroke_color': '#000000',
            'stroke_width': 0.1
        }, font_dict={
            'stroke-width': '0.1px',
        }, font_colors=font_colors_by_state)
        cg.done(show=True)

Analysis


In [27]:
df_topics = {}
for topic in topic_order:
    df_t = df[(df.u_state != "USA") 
              & (df.t_n_urls > 0)
              & (df.topic_name == topic)
             ].assign(
    fakenews=lambda x: x.CATS_Counter.apply(lambda k: k.get('fakenews', 0))
    )[["u_state", "fakenews"]].groupby("u_state")["fakenews"].agg([np.mean, len, np.std]).reset_index()
    df_t["value_rank"] = df_t["mean"].rank(ascending=False)
    df_topics[topic] = (df_t[
            (df_t["len"] >= (df_t["len"].sort_values().values[-10]))
            #(df_t["len"] > (df_t["len"].sum() * 0.01)
        ].sort_values("mean",
                                          ascending=False).reset_index().apply(
            lambda x: "%s (%.2f) [%s]" % (
                x["u_state"], x["mean"], x["len"]), axis=1))
pd.concat(df_topics, axis=1, keys=topic_order)


Out[27]:
Gun Control Privacy Vaccine Child Education Skin Damage Seat Belt
0 VA (0.18) [330] FL (0.07) [1252] FL (0.15) [745] DC (0.01) [154] IL (0.01) [236] TX (0.01) [759]
1 FL (0.18) [707] IL (0.07) [742] OH (0.14) [413] CA (0.01) [627] OH (0.01) [183] OH (0.01) [257]
2 TX (0.18) [938] DC (0.06) [1846] TX (0.13) [978] IL (0.01) [173] CA (0.00) [871] CA (0.01) [1404]
3 GA (0.13) [339] PA (0.06) [594] GA (0.08) [436] NY (0.00) [402] AZ (0.00) [236] GA (0.01) [356]
4 PA (0.12) [312] TX (0.06) [1526] CA (0.08) [3507] FL (0.00) [258] FL (0.00) [489] FL (0.01) [661]
5 IL (0.12) [421] NY (0.05) [2563] NY (0.08) [1660] GA (0.00) [138] GA (0.00) [275] PA (0.01) [346]
6 DC (0.12) [429] CA (0.05) [3165] PA (0.07) [432] MO (0.00) [127] NY (0.00) [636] NY (0.01) [965]
7 CA (0.11) [1530] VA (0.04) [715] DC (0.06) [385] NJ (0.00) [123] TX (0.00) [451] IL (0.01) [329]
8 NY (0.09) [1154] MA (0.04) [607] MA (0.06) [447] PA (0.00) [142] NC (0.00) [211] WA (0.00) [305]
9 WA (0.06) [394] WA (0.03) [1050] WA (0.06) [736] TX (0.00) [271] PA (0.00) [216] MI (0.00) [258]

In [28]:
fig, ax = plt.subplots(1,1,figsize=(15,5))
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.barplot(y="is_controversial", x="u_state",
                    errwidth=2,
                data=df[~df.u_state.isin(NON_STATES)].sort_values("u_state"),
               ax=ax, color="0.7")
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    #ax.tick_params(axis='x', which='major', labelsize=10)
    sns.despine(offset=10)


/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [29]:
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
            df.u_state.fillna("UNK").value_counts().index
    ) - NON_STATES)+ sorted(["AS", "DC", "GU",
              "MP", "PR", "VI"]))
colors = ["b"] * 2 + ["r"]*50 +["0.7"]*6
fig, ax = plt.subplots(1,1,figsize=(16,5))
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.barplot(y="is_controversial", x="u_state",
                    errwidth=2,
                data=df.assign(u_state=df.u_state.fillna("UNK")),
               ax=ax, color="r", order=LOCATION_ORDER)
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    #ax.tick_params(axis='x', which='major', labelsize=10)
    [ax.patches[i].set_color(c) for i, c in enumerate(colors)]
    sns.despine(offset=10)
    plt.setp(ax.get_xticklabels()[:3], rotation=90)



In [30]:
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
            df.u_state.fillna("UNK").value_counts().index
    ) - NON_STATES)+ sorted(["AS", "DC", "GU",
              "MP", "PR", "VI"]))
colors = ["b"] * 2 + ["r"]*50 +["k"]*6
total_controversial = df[(df.is_controversial == 1) & (~df.u_state.isin(NON_STATES))].shape[0] * 1.
fig, ax = plt.subplots(1,1,figsize=(16,5))
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.barplot(y="is_controversial", x="u_state",
                    
                data=df[
            (df.is_controversial == 1)
            & (~df.u_state.isin(NON_STATES))
        ],
               ax=ax, color="0.5",
                    order=LOCATION_ORDER[2:-6],
                    ci=None, estimator=lambda x: len(x)/total_controversial)
    #ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Distribution of controversial tweets\nacross states")
    ax.set_xlabel("US States")
    #ax.tick_params(axis='x', which='major', labelsize=10)
    #[ax.patches[i].set_color(c) for i, c in enumerate(colors)]
    sns.despine(offset=10)
    #plt.setp(ax.get_xticklabels()[:3], rotation=90)



In [31]:
df_t = df[(~df.u_state.isin(NON_STATES)) & (~df.u_state.isnull())].pivot_table(
    index="u_state", columns="topic_name", values="t_id", aggfunc=len)
with sns.plotting_context(
    rc={"axes.titlesize": 10,
        "axes.labelsize": 10,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.PairGrid(df_t.divide(df_t.sum(axis=0), axis=1).reset_index(),
                     x_vars=topic_order, y_vars=["u_state"],
                     size=10, aspect=.25)
    g.map(sns.stripplot, size=10, orient="h",
          color="k", edgecolor="gray")

    # Use the same x axis limits on all columns and add better labels
    g.set(xlabel="proportion", ylabel="",)

    # Use semantically meaningful titles for the columns
    titles = topic_order

    for ax, title in zip(g.axes.flat, titles):

        # Set a different title for each axes
        ax.set(title=title)

        # Make the grid horizontal instead of vertical
        ax.xaxis.grid(False)
        ax.yaxis.grid(True)

    sns.despine(left=True, bottom=True)

# Draw a dot plot using the stripplot function



In [32]:
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
            df.u_state.fillna("UNK").value_counts().index
    ) - NON_STATES)+ sorted(["AS", "DC", "GU",
              "MP", "PR", "VI"]))
colors = ["b"] * 2 + ["r"]*50 +["0.7"]*6
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    fig, ax = plt.subplots(1,1, figsize=(20,5))
    ax = sns.countplot(df.u_state.fillna("UNK"), color='k', ax=ax, 
                      order=LOCATION_ORDER)
    ax.set_yscale('log')
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Tweet author location')
    plt.xticks(rotation='vertical')
    #sns.despine(offset=2)
    [ax.patches[i].set_color(c) for i, c in enumerate(colors)]



In [33]:
pd.concat([pd.DataFrame(k.reset_index().values, columns=["Location", "Counts"])
    for k in np.array_split(df.u_state.fillna("UNK").value_counts(), 4, axis=0)], axis=1)


Out[33]:
Location Counts Location Counts Location Counts Location Counts
0 UNK 77831 NJ 3402 AL 1449 ID 399
1 CA 22123 AZ 3242 OK 1443 WV 325
2 USA 21114 IN 3222 KY 1368 AK 291
3 NY 14689 MI 3141 CT 1284 MT 289
4 TX 11666 CO 2669 UT 1059 VT 278
5 FL 8913 TN 2622 IA 934 DE 274
6 IL 5331 MD 2431 AR 869 SD 237
7 DC 5150 OR 2171 MS 786 WY 173
8 WA 5093 MO 2002 NE 683 ND 160
9 PA 4653 LA 1951 ME 673 PR 24
10 OH 4601 WI 1813 NH 585 AS 10
11 GA 4377 NV 1693 HI 502 GU 5
12 MA 3913 MN 1675 RI 455 VI 5
13 VA 3796 SC 1539 NM 432 MP 2
14 NC 3568 KS 1484 NaN NaN NaN NaN

In [34]:
df.u_state.describe()


Out[34]:
count     169038
unique        57
top           CA
freq       22123
Name: u_state, dtype: object

In [35]:
df.u_state.shape


Out[35]:
(246869,)

In [36]:
df.groupby("u_id")["u_state"].first().shape, df.groupby("u_id")["u_state"].first().describe()


Out[36]:
((151073,), count     107970
 unique        57
 top           CA
 freq       13251
 Name: u_state, dtype: object)

In [ ]: